import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore
# Load the wine-quality dataset into a DataFrame and display it.
df = pd.read_csv("WineQT.csv")
df
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 13 columns
# 'Id' is a row identifier with no predictive value — remove it, then
# summarise dtypes and non-null counts.
df = df.drop(columns='Id')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1143 entries, 0 to 1142 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1143 non-null float64 1 volatile acidity 1143 non-null float64 2 citric acid 1143 non-null float64 3 residual sugar 1143 non-null float64 4 chlorides 1143 non-null float64 5 free sulfur dioxide 1143 non-null float64 6 total sulfur dioxide 1143 non-null float64 7 density 1143 non-null float64 8 pH 1143 non-null float64 9 sulphates 1143 non-null float64 10 alcohol 1143 non-null float64 11 quality 1143 non-null int64 dtypes: float64(11), int64(1) memory usage: 107.3 KB
df.isnull().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
sns.pairplot(df,hue='quality')
<seaborn.axisgrid.PairGrid at 0x26ad170bee0>
# Pearson correlation heatmap across all numeric columns.
plt.figure(figsize=(14, 6))
corr_matrix = df.corr(method='pearson')
sns.heatmap(corr_matrix, annot=True, vmax=1, vmin=-1,
            linewidths=1, linecolor='White')
plt.show()
# Distribution plot for every column, one figure each.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# sns.histplot(..., kde=True) is the supported equivalent.
for col in df.columns:
    sns.histplot(df[col], kde=True)
    plt.show()
# Box plot per column to inspect outliers.
for col in df.columns:
    sns.boxplot(df[col])
    plt.show()
# Show the remaining column names.
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
# Log-transform the right-skewed columns identified in the distribution plots.
skewed_cols = ['sulphates', 'alcohol', 'residual sugar', 'chlorides',
               'free sulfur dioxide', 'total sulfur dioxide']
df[skewed_cols] = np.log(df[skewed_cols])
# Discard rows where citric acid is exactly zero, then inspect the
# remaining value distribution.
df = df[df['citric acid'] != 0.00]
df['citric acid'].value_counts()
0.49 47
0.24 42
0.02 35
0.01 26
0.26 26
..
0.79 1
0.72 1
0.62 1
0.75 1
1.00 1
Name: citric acid, Length: 76, dtype: int64
df.apply(zscore)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | -0.363115 | 1.429920 | -1.359801 | -0.079212 | 0.364336 | 0.255610 | 0.529825 | 0.097443 | -0.245963 | 0.038493 | -0.578601 | -0.834077 | -1.732770 |
| 3 | 1.576848 | -1.378363 | 1.426103 | -0.614008 | -0.254987 | 0.438930 | 0.679715 | 0.624490 | -0.911038 | -0.466849 | -0.578601 | 0.397573 | -1.730598 |
| 6 | -0.306057 | 0.493826 | -1.252651 | -1.095047 | -0.507753 | 0.255610 | 0.655805 | -0.218785 | 0.020067 | -1.494889 | -0.998648 | -0.834077 | -1.724083 |
| 8 | -0.363115 | 0.376814 | -1.466952 | -0.470430 | -0.336923 | -0.492571 | -1.033108 | -0.007966 | 0.419112 | -0.543982 | -0.891984 | 1.629223 | -1.719740 |
| 9 | -0.990750 | 0.376814 | -1.145501 | -0.765352 | 0.524767 | 0.255610 | 0.793588 | -0.482309 | -0.112948 | -0.783770 | -1.215424 | -0.834077 | -1.715397 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | -1.218981 | -0.032727 | -0.877626 | -0.079212 | -0.214835 | 1.221174 | 0.102883 | -0.566636 | 0.818156 | 0.673145 | 0.585731 | 0.397573 | 1.719990 |
| 1139 | -0.933692 | 0.610838 | -1.145501 | -0.614008 | -0.552008 | 1.169778 | 0.029911 | -0.160810 | 0.818156 | 1.068885 | -0.891984 | 0.397573 | 1.722161 |
| 1140 | -1.276038 | 0.493826 | -1.145501 | -0.470430 | 0.297709 | 1.365355 | 0.238476 | -1.009356 | 1.017679 | -0.466849 | 0.116824 | -0.834077 | 1.724333 |
| 1141 | -1.447212 | 0.201296 | -1.038351 | -0.203640 | -0.832032 | 1.655100 | 0.448509 | -0.893405 | 1.483231 | 0.731888 | 0.767352 | 0.397573 | 1.726505 |
| 1142 | -1.447212 | 0.757102 | -0.931201 | -0.470430 | -0.254987 | 1.365355 | 0.238476 | -0.708939 | 1.815768 | 0.430071 | -0.175360 | -0.834077 | 1.730848 |
1044 rows × 13 columns
# Binarise the target: quality in (2, 6.5] -> 'bad' (0), (6.5, 8] -> 'good' (1).
df['quality'] = pd.cut(df['quality'], bins=(2, 6.5, 8), labels=['bad', 'good'])
df['quality'] = df['quality'].map({'bad': 0, 'good': 1})
# Visualise the resulting class balance.
df['quality'].value_counts().plot.pie(autopct="%.2f%%")
<AxesSubplot:ylabel='quality'>
df.head(5)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 7.8 | 0.76 | 0.04 | 0.832909 | -2.385967 | 2.708050 | 3.988984 | 0.9970 | 3.26 | -0.430783 | 2.282382 | 0 |
| 3 | 11.2 | 0.28 | 0.56 | 0.641854 | -2.590267 | 2.833213 | 4.094345 | 0.9980 | 3.16 | -0.544727 | 2.282382 | 0 |
| 6 | 7.9 | 0.60 | 0.06 | 0.470004 | -2.673649 | 2.708050 | 4.077537 | 0.9964 | 3.30 | -0.776529 | 2.240710 | 0 |
| 8 | 7.8 | 0.58 | 0.02 | 0.693147 | -2.617296 | 2.197225 | 2.890372 | 0.9968 | 3.36 | -0.562119 | 2.251292 | 1 |
| 9 | 6.7 | 0.58 | 0.08 | 0.587787 | -2.333044 | 2.708050 | 4.174387 | 0.9959 | 3.28 | -0.616186 | 2.219203 | 0 |
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# Features are every column except the target 'quality' (the last column).
X = df.iloc[:, :-1]
y = df['quality']
# Hold out 20% of the rows for evaluation, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(835, 11) (835,) (209, 11) (209,)
# Ordinary least-squares baseline on the binary target (fit returns the
# estimator itself, so construction and fitting can be chained).
lr = LinearRegression().fit(X_train, y_train)
LinearRegression()
# Predictions for both splits (y_test_pred is also reported further down).
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
# Training-set fit quality.
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
print('RMSE', round(train_rmse, 2))
print('R2 SCORE', metrics.r2_score(y_train, y_train_pred))
RMSE 0.3 R2 SCORE 0.2564554498612728
# Held-out test-set performance of the linear baseline.
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
print('RMSE', round(test_rmse, 2))
print('R2 SCORE', metrics.r2_score(y_test, y_test_pred))
RMSE 0.31 R2 SCORE 0.30266733125135226
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# NOTE(review): X and y are reassigned to the same values as above; the
# earlier train/test split is reused unchanged.
X = df.iloc[:, :-1]
y = df['quality']
# k-NN classifier with the sklearn default of n_neighbors=5.
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
KNeighborsClassifier()
# Evaluate the default k-NN on both splits.
y_train_pred=kn.predict(X_train)
y_test_pred=kn.predict(X_test)
# Train vs. test accuracy; a large gap would indicate overfitting.
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.911377245508982 0.8564593301435407
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation over odd k values (1..49) to tune n_neighbors.
k_neighbors = list(range(1, 50, 2))
cv_scores = []
for k in k_neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
# Misclassification rate per candidate k.
mse = [1 - x for x in cv_scores]
# FIX: the original ended with a bare `mse.index` (an uncalled bound method) —
# a leftover debugging no-op that only displayed the method repr; removed.
<function list.index(value, start=0, stop=9223372036854775807, /)>
# Pick the k with the lowest cross-validated error.  On ties the smaller k
# wins (k_neighbors is ascending), matching the original first-occurrence
# `list.index(min(...))` lookup.
# NOTE(review): optimal_k is computed but never used to refit the classifier.
optimal_k = min(zip(mse, k_neighbors))[1]
optimal_k
1
from sklearn.metrics import accuracy_score,classification_report,recall_score,precision_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with sklearn defaults (L2 penalty).
lor=LogisticRegression()
lor.fit(X_train,y_train)
LogisticRegression()
# Hard class predictions and probability estimates for each split.
y_train_pred = lor.predict(X_train)
y_train_proba = lor.predict_proba(X_train)
y_test_pred = lor.predict(X_test)
y_test_proba = lor.predict_proba(X_test)
# Accuracy on train and test.
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))
0.8658682634730539 0.8516746411483254
# Confusion matrices (rows = true label, columns = predicted label).
print(confusion_matrix(y_train,y_train_pred))
print(confusion_matrix(y_test,y_test_pred))
[[701 14] [ 98 22]] [[174 0] [ 31 4]]
# Per-class precision/recall/F1 for the train and test splits.
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 0.88 0.98 0.93 715
1 0.61 0.18 0.28 120
accuracy 0.87 835
macro avg 0.74 0.58 0.60 835
weighted avg 0.84 0.87 0.83 835
precision recall f1-score support
0 0.85 1.00 0.92 174
1 1.00 0.11 0.21 35
accuracy 0.85 209
macro avg 0.92 0.56 0.56 209
weighted avg 0.87 0.85 0.80 209
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Unconstrained decision tree — expected to overfit the training data.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
DecisionTreeClassifier()
# Evaluate the unconstrained tree: perfect train scores combined with weaker
# test scores confirm overfitting.
y_train_pred=dt.predict(X_train)
y_test_pred=dt.predict(X_test)
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 1.00 1.00 1.00 715
1 1.00 1.00 1.00 120
accuracy 1.00 835
macro avg 1.00 1.00 1.00 835
weighted avg 1.00 1.00 1.00 835
precision recall f1-score support
0 0.91 0.92 0.91 174
1 0.58 0.54 0.56 35
accuracy 0.86 209
macro avg 0.74 0.73 0.74 209
weighted avg 0.85 0.86 0.85 209
# Depth-capped tree to reduce the overfitting observed above.
dt1=DecisionTreeClassifier(max_depth=10)
dt1.fit(X_train,y_train)
DecisionTreeClassifier(max_depth=10)
# Evaluate the depth-capped tree on both splits.
y_train_pred=dt1.predict(X_train)
y_test_pred=dt1.predict(X_test)
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 0.99 1.00 1.00 715
1 0.97 0.97 0.97 120
accuracy 0.99 835
macro avg 0.98 0.98 0.98 835
weighted avg 0.99 0.99 0.99 835
precision recall f1-score support
0 0.90 0.92 0.91 174
1 0.55 0.49 0.52 35
accuracy 0.85 209
macro avg 0.72 0.70 0.71 209
weighted avg 0.84 0.85 0.84 209
# Visualise only the top three levels of the pruned tree for readability.
fig, ax = plt.subplots(figsize=(10, 10))
chart = plot_tree(dt1, max_depth=3, feature_names=X.columns,
                  filled=True, fontsize=10)